import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import cufflinks as cf
import plotly
import plotly.graph_objs as go
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
#print(__version__) # requires version >= 1.9.0
# For Notebooks
init_notebook_mode(connected=True)
# For offline use
cf.go_offline()
%matplotlib inline
school_frame = pd.read_csv('2016 School Explorer.csv')
shsat_frame = pd.read_csv("D5 SHSAT Registrations and Testers.csv")
pd.set_option('display.max_columns', None)
school_frame.head()
school_frame.shape
school_frame.columns
# Function converting string % values to int
def percent_to_int(df_in):
for col in df_in.columns.values:
if col.startswith("Percent") or col.endswith("%") or col.endswith("Rate"):
df_in[col] = df_in[col].astype(np.object).str.replace('%', '').astype(float)
return df_in
school_frame = percent_to_int(school_frame)
#Filtering out non Middle Schools
school_frame_raw = school_frame[school_frame['Grade High'] != '0K']
#school_frame_relevant_grade = school_frame[school_frame['Grade High'].astype(int) > 5]
#Cleaning School Income Estimate
school_frame_raw['School Income Estimate'] = school_frame['School Income Estimate'].astype(np.object).str.replace('$', '').str.replace(',', '').str.replace('.', '').astype(float)
#Removing >95% null columns
high_nan_columns = school_frame_raw.columns[school_frame_raw.isnull().mean()>0.95]
#school_frame = school_frame_relevant_grade.drop(high_nan_columns, axis=1)
#print(list(high_bnan_columns))
#school_frame_raw contains all grades while school_frame contains SHSAT eligible middle schools
school_frame_relevant_grades = school_frame_raw[school_frame_raw['Grade High'].astype(int) > 5]
school_frame = school_frame_relevant_grades.drop(high_nan_columns, axis=1)
#Filling null values which prevent calculations
school_frame['School Income Estimate'] = school_frame['School Income Estimate'].fillna(0)
school_frame['Economic Need Index'] = school_frame['Economic Need Index'].fillna(0)
print("We have %d relevant schools and %d fields describing the school/ students"%(school_frame.shape))
#Distributions of schools based on city
schools_by_city = school_frame['City'].value_counts()
schools_by_city.iplot(kind='bar',yTitle='Number of SHSAT Eligible Schools', title='SHSAT Eligible Schools by City', filename='cufflinks/categorical-bar-chart')
#Analysis of Chronically Absent Students
school_frame = school_frame.dropna(subset = ['Percent of Students Chronically Absent'])
temp = sns.distplot(school_frame[['Percent of Students Chronically Absent']].values, kde=False)
temp = plt.title('Distribution of Schools Based on Chronically Absent Students')
temp = plt.xlabel("Percent of students")
temp = plt.ylabel("Count")
chronically_absent_foi = ['School Name', 'Percent of Students Chronically Absent','Percent Black / Hispanic','Economic Need Index']
df_schools_ca = school_frame[school_frame['Percent of Students Chronically Absent'] > 40]
df_schools_nca = school_frame[school_frame['Percent of Students Chronically Absent'] < 5]
df_schools_ca[chronically_absent_foi].sort_values('Percent of Students Chronically Absent', ascending = False)
plt.figure(figsize = [16,7])
plt.suptitle('Statistics for Schools with High Chronically Absent Percentage', fontsize=15)
plt.subplot(1,2,1)
temp = sns.distplot(df_schools_ca[['Economic Need Index']].values, kde=False)
temp = plt.xlabel("Economic Need Index", fontsize=15)
temp = plt.ylabel("School count", fontsize=15)
plt.subplot(1,2,2)
temp = sns.distplot(df_schools_ca[['Percent Black / Hispanic']].values, kde=False)
temp = plt.xlabel("Percent Black / Hispanic", fontsize=15)
temp = plt.ylabel("School count", fontsize=15)
data = [
{
'x': school_frame["Longitude"],
'y': school_frame["Latitude"],
'text': school_frame["School Name"],
'mode': 'markers',
'marker': {
'color': school_frame["Economic Need Index"],
'size': school_frame["School Income Estimate"]/4500,
'showscale': True,
'colorscale':'Portland'
}
}
]
layout= go.Layout(
title= 'New York School Population (Economic Need Index)',
xaxis= dict(
title= 'Longitude'
),
yaxis=dict(
title='Latitude'
))
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='scatter_hover_labels.html')
#Black Population
data = [
{
'x': school_frame["Longitude"],
'y': school_frame["Latitude"],
'text': school_frame["School Name"],
'mode': 'markers',
'marker': {
'color': school_frame["Percent Black"],
'size': school_frame["School Income Estimate"]/4500,
'showscale': True,
'colorscale':'Portland'
}
}
]
layout= go.Layout(
title= 'New York Black Student Ratio Of School',
xaxis= dict(
title= 'Longitude'
),
yaxis=dict(
title='Latitude'
)
)
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='scatter_hover_labels')
#Hispanic Population
data = [
{
'x': school_frame["Longitude"],
'y': school_frame["Latitude"],
'text': school_frame["School Name"],
'mode': 'markers',
'marker': {
'color': school_frame["Percent Hispanic"],
'size': school_frame["School Income Estimate"]/4500,
'showscale': True,
'colorscale':'Portland'
}
}
]
layout= go.Layout(
title= 'New York Hispanic Student Ratio Of School',
xaxis= dict(
title= 'Longitude'
),
yaxis=dict(
title='Latitude'
)
)
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='scatter_hover_labels')
#combination of black and latino
#bl_frame = school_frame["Percent Hispanic"] + school_frame["Percent Black"]
#Hispanic Population
data = [
{
'x': school_frame["Longitude"],
'y': school_frame["Latitude"],
'text': school_frame["School Name"],
'mode': 'markers',
'marker': {
'color': school_frame["Percent Black / Hispanic"],
'size': school_frame["School Income Estimate"]/4500,
'showscale': True,
'colorscale':'Portland'
}
}
]
layout= go.Layout(
title= 'New York Black/Hispanic Student Ratio Of School',
xaxis= dict(
title= 'Longitude'
),
yaxis=dict(
title='Latitude'
)
)
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='scatter_hover_labels')
lower_bound = school_frame['Economic Need Index'].quantile(0.75)
high_eni_schools = school_frame[school_frame['Economic Need Index'] > lower_bound]
low_eni_schools = school_frame[school_frame['Economic Need Index'] < 0.25]
high_eni_schools[['School Name', 'Economic Need Index', 'Percent Black / Hispanic']].sort_values('Economic Need Index', ascending = False).head(10)
print ("Average ELA / Math performance for the high ENI schools")
high_eni_schools[['Average ELA Proficiency','Average Math Proficiency']].mean()
print ("Average ELA / Math performance for the low ENI schools")
low_eni_schools[['Average ELA Proficiency','Average Math Proficiency']].mean()
#After school programs
#Start by looking at which schools have the lowest income estimates.
income_estimate = school_frame[['School Name','School Income Estimate']].sort_values(by='School Income Estimate',ascending=True)
#There are several null columns here.. this makes it difficult to use this category as a method of predicting need
null_columns=income_estimate.columns[income_estimate.isnull().any()]
income_estimate[null_columns].isnull().sum()
#Lets see how many null columns in the entire school frame
#There is a considerable amount missing (31%) in the School Income Estimate, leading me to want to avoid using it
null_columns=school_frame.columns[school_frame.isnull().any()]
school_frame[null_columns].isnull().sum()
#Lets see how many null columns in the entire school frame
#Conclusion: Doesn't seem to be any
null_columns=shsat_frame.columns[shsat_frame.isnull().any()]
shsat_frame[null_columns].isnull().sum()
column_list = school_frame.columns.values
for i,a in enumerate(column_list):
print('%s is %d' % (a, i))
#Correlation matrix of school frame
school_corr = school_frame.iloc[:,[13,15,16,17,18,19,20,21,22,36,37]]
corr = school_corr.corr()
corr = (corr)
fig, ax = plt.subplots()
# the size of A4 paper
fig.set_size_inches(11.7, 8.27)
sns.heatmap(corr,
xticklabels=corr.columns.values,
yticklabels=corr.columns.values,
cmap="RdYlGn", center=0, annot=True)
#plt.savefig('img/correlation.png',bbox_inches='tight')
#Correlation matrix of school frame
school_corr = school_frame.iloc[:,[13,14,15,16,17,18,20,21,22]]
corr = school_corr.corr()
corr = (corr)
fig, ax = plt.subplots()
# the size of A4 paper
fig.set_size_inches(11.7, 8.27)
sns.heatmap(corr,
xticklabels=corr.columns.values,
yticklabels=corr.columns.values,
cmap="RdYlGn", center=0, annot=True)
#Test Prep and Tutoring
minority_frame = school_frame_raw[school_frame_raw['Percent Black / Hispanic'].astype(int)>70]
print(minority_frame.shape)
def grade_minority_percent_4s (df, grade, subject):
out_field = ('Grade %d %s Minority 4s')%(grade, subject)
num1 = ('Grade %d %s 4s - Black or African American')%(grade, subject)
num2 = ('Grade %d %s 4s - Hispanic or Latino')%(grade, subject)
den = ('Grade %d %s 4s - All Students')%(grade, subject)
df = df[df[den].astype(int)>5]
df[out_field] = (df[num1] + df[num2])/(df[den])
grade_minority_df = df[df[out_field]>0.7]
print(grade, subject, df.shape, grade_minority_df.shape, (grade_minority_df[num1] + grade_minority_df[num2]).mean(), df[num1].mean(), df[num2].mean())
return df
import itertools
grades = [5,6,7]
subjects = ['ELA', 'Math']
i=1
plt.figure(figsize=[19,10])
df_all_schools = pd.DataFrame(columns=minority_frame.columns)
for grade,subject in itertools.product(grades, subjects):
df_schools_tmp = grade_minority_percent_4s(minority_frame, grade, subject)
# df_schools_tmp[]
fname = ('Grade %d %s Minority 4s')%(grade, subject)
plt.subplot(3,2,i)
df_schools_tmp[fname].hist()
df_schools_tmp.drop([fname], axis=1, inplace=True)
plt.title(fname)
df_all_schools = pd.concat([df_all_schools, df_schools_tmp])
i += 1
plt.savefig('img/minority4s.png',bbox_inches='tight')
#SHSAT Analysis
shsat_frame.head()
shsat_frame['Took Percentage'] = shsat_frame['Number of students who took the SHSAT'] / shsat_frame['Number of students who registered for the SHSAT']
shsat_sorted = shsat_frame.groupby(['DBN','School name'], as_index=False).sum()
shsat_sorted.head()
shsat_sorted = shsat_frame.groupby(['DBN','School name'], as_index=False).sum()
shsat_sorted = shsat_sorted[['DBN','School name',
'Enrollment on 10/31',
'Number of students who registered for the SHSAT',
'Number of students who took the SHSAT']].join(shsat_frame[['Year of SHST',
'Grade level']], how='inner')
shsat_sorted = shsat_sorted.sort_values('Number of students who registered for the SHSAT',ascending=False)
shsat_sorted.head()
shsat_sorted['Took Percentage'] = shsat_sorted['Number of students who took the SHSAT'] / shsat_sorted['Number of students who registered for the SHSAT']
sns.set(style="whitegrid")
# Initialize the matplotlib figure
f, ax = plt.subplots(figsize=(15, 10))
# Plot the total schools per city
sns.set_color_codes("pastel")
sns.barplot(x="Number of students who registered for the SHSAT", y='School name', data=shsat_sorted,
label="# of SHSAT Registrations", color="b", ci=None)
# Plot the total community schools per city
sns.set_color_codes("muted")
sns.barplot(x="Number of students who took the SHSAT", y="School name", data=shsat_sorted,
label="# of Students who Took SHSAT", color="b",ci=None)
# Add a legend and informative axis label
ax.legend(ncol=2, loc="lower right", frameon=True)
ax.set(xlim=(0, 300), ylabel="School Name", title='SHSAT School Registration Distribution',
xlabel="# of Registrations")
sns.despine(left=True, bottom=True)
plt.savefig('img/shsatregdist.png',bbox_inches='tight')
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(16, 9))
sns.barplot(y='School name', x='Took Percentage', data=shsat_sorted.sort_values('Took Percentage', ascending = False), ax=ax, orient='h', ci=None, color='b')
shsat_low_appearance_df = shsat_sorted[shsat_sorted['Took Percentage'] <= 0.4]
shsat_low_appearance_df.sort_values('Took Percentage')
poor_schools_shsat_performance = list(np.unique(shsat_sorted['DBN']))
# poor_schools_shsat_performance = [i.strip().lower() for i in poor_schools_shsat_performance]
total_schools = list(set(school_frame['Location Code']))
# total_schools = [i.strip().lower() for i in total_schools]
intersection = list(set(poor_schools_shsat_performance).intersection(set(total_schools)))
print('Number of Intersecting schools = {}'.format(len(intersection)))
percent_black_hist = []
for school in intersection:
percent_black_hist.append(list(school_frame[school_frame['Location Code'] == school]['Percent Black / Hispanic'])[0])
plt.figure(figsize=(10,5))
sns.distplot(percent_black_hist, kde=False)
temp = plt.title('Percent Black / Hispanic in low SHSAT-taking schools', fontsize=15)
temp = plt.xlabel("Percent Black / Hispanic", fontsize=15)
temp = plt.ylabel("Number of Schools", fontsize=15)
plt.savefig('img/percentbhshsat.png',bbox_inches='tight')
#As seen from the histogram above, the percentage of Black and Hispanic students is high where the percent of students taking the exam after registration is low.
#After School Programs 2.0 - School Performance Index
features_list = ['Rigorous Instruction %',
'Collaborative Teachers %',
'Supportive Environment %',
'Effective School Leadership %',
'Strong Family-Community Ties %',
'Trust %']
school_frame[['School Name'] + features_list ].head()
#Looking at correlation of these features
school_frame[features_list].corr()
#From this we can see how correlated these features are to eachother
corr = school_frame[features_list].corr()
plt.figure(figsize=(12, 10))
sns.heatmap(corr, cmap='RdYlGn')
temp = plt.xticks(rotation=75, fontsize=15)
temp = plt.yticks(fontsize=15)
plt.savefig('img/schoolperformance.png',bbox_inches='tight')
#The highest correlation is between Effective School Leadership, Collaborative Teachers and, Trust
#Principal Component Anaysis (PCA)' has been applied on the 3 features to get a single combined feature capturing the key characteristics of all those three features.
from sklearn.decomposition import PCA
correlated_features_list = ["Effective School Leadership %","Collaborative Teachers %","Trust %"]
corr_features_values = school_frame[correlated_features_list].values
pca = PCA(n_components=1)
combined_feature_value = pca.fit_transform(corr_features_values)
school_frame['PCA Combined Feature'] = combined_feature_value
#df_schools[correlated_features_list + ['PCA Combined Feature']].corr()
import sklearn
scaler = sklearn.preprocessing.MinMaxScaler()
scale_factor = 2*(school_frame['PCA Combined Feature'].corr(school_frame["Effective School Leadership %"])>0) -1
school_frame['PCA Combined Feature'] = scaler.fit_transform(scale_factor * school_frame['PCA Combined Feature'].values.reshape(-1,1))*100
print ("The correlation between the three correlated features and their PCA is shown below:")
school_frame[correlated_features_list + ['PCA Combined Feature']].corr()
#Weights to each factor for calculating School Performace Indicator:
#1.0 := Supportive Environment %
#0.8 := Rigorous Instruction %
#0.7 := PCA combined feature %
#0.5 := Strong Family-Community Ties %
#NOTE: PCA combined features incorporates these three features - Effective School Leadership %, Collaborative Teachers %, Trust %
features = ['Rigorous Instruction %','Supportive Environment %','PCA Combined Feature',
'Strong Family-Community Ties %']
weights = [0.8, 1, 0.7, 0.5]
school_frame['SPI'] = school_frame[features].dot(weights)
print ("A glimpse of the School Performance Index (SPI) :")
school_frame[features+['SPI']].head(5)
#Separting schools into low and high performance groups
df_low_spi_schools = school_frame[school_frame['SPI'] < school_frame['SPI'].quantile(.25)]
df_high_spi_schools = school_frame[school_frame['SPI'] > school_frame['SPI'].quantile(.25)]
#fig = plot_city_hist(df_low_spi_schools, 'Distribution of low SPI schools by city')
#iplot(fig)
#We can see that there is high disparity in the ELA/Math proficiency amongst school students with high/low SPI.
print ("Average ELA / Math performance for the high SPI schools")
df_high_spi_schools[['Average ELA Proficiency','Average Math Proficiency']].mean()
print ("Average ELA / Math performance for the low SPI schools")
df_low_spi_schools[['Average ELA Proficiency','Average Math Proficiency']].mean()
import base64
from IPython.display import HTML
def download_link(df, filename = "data.csv"):
csv = df.to_csv()
b64 = base64.b64encode(csv.encode())
payload = b64.decode()
title = "Download CSV file"
html = '<button type="button" style="font-size: larger; background-color: #FFFFFF; border: 0pt;"><a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{title}</a></button>'
html = html.format(payload=payload,title=title,filename=filename)
return HTML(html)
dfn = df_low_spi_schools.drop_duplicates()
print(dfn.shape)
download_link(dfn, 'low_spi_schools.csv')
dfn = df_all_schools.drop_duplicates()
print(dfn.shape)
download_link(dfn, 'top_performing_students.csv')